@@ -161,78 +161,46 @@ module Agents |
||
161 | 161 |
log "Storing new result for '#{name}': #{doc.inspect}" |
162 | 162 |
create_event :payload => doc |
163 | 163 |
end |
164 |
- else |
|
165 |
- output = {} |
|
166 |
- interpolated['extract'].each do |name, extraction_details| |
|
167 |
- case extraction_type |
|
168 |
- when "text" |
|
169 |
- regexp = Regexp.new(extraction_details['regexp']) |
|
170 |
- result = [] |
|
171 |
- doc.scan(regexp) { |
|
172 |
- result << Regexp.last_match[extraction_details['index']] |
|
173 |
- } |
|
174 |
- log "Extracting #{extraction_type} at #{regexp}: #{result}" |
|
175 |
- when "json" |
|
176 |
- result = Utils.values_at(doc, extraction_details['path']) |
|
177 |
- log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}" |
|
178 |
- else |
|
179 |
- case |
|
180 |
- when css = extraction_details['css'] |
|
181 |
- nodes = doc.css(css) |
|
182 |
- when xpath = extraction_details['xpath'] |
|
183 |
- doc.remove_namespaces! # ignore xmlns, useful when parsing atom feeds |
|
184 |
- nodes = doc.xpath(xpath) |
|
185 |
- else |
|
186 |
- error '"css" or "xpath" is required for HTML or XML extraction' |
|
187 |
- return |
|
188 |
- end |
|
189 |
- case nodes |
|
190 |
- when Nokogiri::XML::NodeSet |
|
191 |
- result = nodes.map { |node| |
|
192 |
- case value = node.xpath(extraction_details['value']) |
|
193 |
- when Float |
|
194 |
- # Node#xpath() returns any numeric value as float; |
|
195 |
- # convert it to integer as appropriate. |
|
196 |
- value = value.to_i if value.to_i == value |
|
197 |
- end |
|
198 |
- value.to_s |
|
199 |
- } |
|
200 |
- else |
|
201 |
- error "The result of HTML/XML extraction was not a NodeSet" |
|
202 |
- return |
|
203 |
- end |
|
204 |
- log "Extracting #{extraction_type} at #{xpath || css}: #{result}" |
|
205 |
- end |
|
206 |
- output[name] = result |
|
164 |
+ next |
|
165 |
+ end |
|
166 |
+ |
|
167 |
+ output = |
|
168 |
+ case extraction_type |
|
169 |
+ when 'json' |
|
170 |
+ extract_json(doc) |
|
171 |
+ when 'text' |
|
172 |
+ extract_text(doc) |
|
173 |
+ else |
|
174 |
+ extract_xml(doc) |
|
207 | 175 |
end |
208 | 176 |
|
209 |
- num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq |
|
177 |
+ num_unique_lengths = interpolated['extract'].keys.map { |name| output[name].length }.uniq |
|
210 | 178 |
|
211 |
- if num_unique_lengths.length != 1 |
|
212 |
- error "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}" |
|
213 |
- return |
|
214 |
- end |
|
179 |
+ if num_unique_lengths.length != 1 |
|
180 |
+ raise "Got an uneven number of matches for #{interpolated['name']}: #{interpolated['extract'].inspect}" |
|
181 |
+ end |
|
215 | 182 |
|
216 |
- old_events = previous_payloads num_unique_lengths.first |
|
217 |
- num_unique_lengths.first.times do |index| |
|
218 |
- result = {} |
|
219 |
- interpolated['extract'].keys.each do |name| |
|
220 |
- result[name] = output[name][index] |
|
221 |
- if name.to_s == 'url' |
|
222 |
- result[name] = (response.env[:url] + result[name]).to_s |
|
223 |
- end |
|
183 |
+ old_events = previous_payloads num_unique_lengths.first |
|
184 |
+ num_unique_lengths.first.times do |index| |
|
185 |
+ result = {} |
|
186 |
+ interpolated['extract'].keys.each do |name| |
|
187 |
+ result[name] = output[name][index] |
|
188 |
+ if name.to_s == 'url' |
|
189 |
+ result[name] = (response.env[:url] + result[name]).to_s |
|
224 | 190 |
end |
191 |
+ end |
|
225 | 192 |
|
226 |
- if store_payload!(old_events, result) |
|
227 |
- log "Storing new parsed result for '#{name}': #{result.inspect}" |
|
228 |
- create_event :payload => result |
|
229 |
- end |
|
193 |
+ if store_payload!(old_events, result) |
|
194 |
+ log "Storing new parsed result for '#{name}': #{result.inspect}" |
|
195 |
+ create_event :payload => result |
|
230 | 196 |
end |
231 | 197 |
end |
232 | 198 |
else |
233 |
- error "Failed: #{response.inspect}" |
|
199 |
+ raise "Failed: #{response.inspect}" |
|
234 | 200 |
end |
235 | 201 |
end |
202 |
+ rescue => e |
|
203 |
+ error e.message |
|
236 | 204 |
end |
237 | 205 |
|
238 | 206 |
def receive(incoming_events) |
@@ -266,7 +234,7 @@ module Agents |
||
266 | 234 |
old_event.expires_at = new_event_expiration_date |
267 | 235 |
old_event.save! |
268 | 236 |
return false |
269 |
- end |
|
237 |
+ end |
|
270 | 238 |
end |
271 | 239 |
return true |
272 | 240 |
end |
@@ -305,27 +273,81 @@ module Agents |
||
305 | 273 |
end).to_s |
306 | 274 |
end |
307 | 275 |
|
276 |
+ def extract_each(doc, &block) |
|
277 |
+ interpolated['extract'].each_with_object({}) { |(name, extraction_details), output| |
|
278 |
+ output[name] = block.call(extraction_details) |
|
279 |
+ } |
|
280 |
+ end |
|
281 |
+ |
|
282 |
+ def extract_json(doc) |
|
283 |
+ extract_each(doc) { |extraction_details| |
|
284 |
+ result = Utils.values_at(doc, extraction_details['path']) |
|
285 |
+ log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}" |
|
286 |
+ result |
|
287 |
+ } |
|
288 |
+ end |
|
289 |
+ |
|
290 |
+ def extract_text(doc) |
|
291 |
+ extract_each(doc) { |extraction_details| |
|
292 |
+ regexp = Regexp.new(extraction_details['regexp']) |
|
293 |
+ result = [] |
|
294 |
+ doc.scan(regexp) { |
|
295 |
+ result << Regexp.last_match[extraction_details['index']] |
|
296 |
+ } |
|
297 |
+ log "Extracting #{extraction_type} at #{regexp}: #{result}" |
|
298 |
+ result |
|
299 |
+ } |
|
300 |
+ end |
|
301 |
+ |
|
302 |
+ def extract_xml(doc) |
|
303 |
+ extract_each(doc) { |extraction_details| |
|
304 |
+ case |
|
305 |
+ when css = extraction_details['css'] |
|
306 |
+ nodes = doc.css(css) |
|
307 |
+ when xpath = extraction_details['xpath'] |
|
308 |
+ doc.remove_namespaces! # ignore xmlns, useful when parsing atom feeds |
|
309 |
+ nodes = doc.xpath(xpath) |
|
310 |
+ else |
|
311 |
+ raise '"css" or "xpath" is required for HTML or XML extraction' |
|
312 |
+ end |
|
313 |
+ case nodes |
|
314 |
+ when Nokogiri::XML::NodeSet |
|
315 |
+ result = nodes.map { |node| |
|
316 |
+ case value = node.xpath(extraction_details['value']) |
|
317 |
+ when Float |
|
318 |
+ # Node#xpath() returns any numeric value as float; |
|
319 |
+ # convert it to integer as appropriate. |
|
320 |
+ value = value.to_i if value.to_i == value |
|
321 |
+ end |
|
322 |
+ value.to_s |
|
323 |
+ } |
|
324 |
+ else |
|
325 |
+ raise "The result of HTML/XML extraction was not a NodeSet" |
|
326 |
+ end |
|
327 |
+ log "Extracting #{extraction_type} at #{xpath || css}: #{result}" |
|
328 |
+ result |
|
329 |
+ } |
|
330 |
+ end |
|
331 |
+ |
|
308 | 332 |
def parse(data) |
309 | 333 |
case extraction_type |
310 |
- when "xml" |
|
311 |
- Nokogiri::XML(data) |
|
312 |
- when "json" |
|
313 |
- JSON.parse(data) |
|
314 |
- when "html" |
|
315 |
- Nokogiri::HTML(data) |
|
316 |
- when "text" |
|
317 |
- data |
|
318 |
- else |
|
319 |
- raise "Unknown extraction type #{extraction_type}" |
|
334 |
+ when "xml" |
|
335 |
+ Nokogiri::XML(data) |
|
336 |
+ when "json" |
|
337 |
+ JSON.parse(data) |
|
338 |
+ when "html" |
|
339 |
+ Nokogiri::HTML(data) |
|
340 |
+ when "text" |
|
341 |
+ data |
|
342 |
+ else |
|
343 |
+ raise "Unknown extraction type #{extraction_type}" |
|
320 | 344 |
end |
321 | 345 |
end |
322 | 346 |
|
323 | 347 |
def is_positive_integer?(value) |
324 |
- begin |
|
325 |
- Integer(value) >= 0 |
|
326 |
- rescue |
|
327 |
- false |
|
328 |
- end |
|
348 |
+ Integer(value) >= 0 |
|
349 |
+ rescue |
|
350 |
+ false |
|
329 | 351 |
end |
330 | 352 |
end |
331 | 353 |
end |